{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第一步：收集数据\n",
    "https://grouplens.org/datasets/movielens/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第二步：准备数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "f:\\Anaconda3\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratings_df = pd.read_csv('ml-latest-small/ratings.csv')\n",
    "movies_df = pd.read_csv('ml-latest-small/movies.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>99999</th>\n",
       "      <td>671</td>\n",
       "      <td>6268</td>\n",
       "      <td>2.5</td>\n",
       "      <td>1065579370</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100000</th>\n",
       "      <td>671</td>\n",
       "      <td>6269</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1065149201</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100001</th>\n",
       "      <td>671</td>\n",
       "      <td>6365</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1070940363</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100002</th>\n",
       "      <td>671</td>\n",
       "      <td>6385</td>\n",
       "      <td>2.5</td>\n",
       "      <td>1070979663</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100003</th>\n",
       "      <td>671</td>\n",
       "      <td>6565</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1074784724</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        userId  movieId  rating   timestamp\n",
       "99999      671     6268     2.5  1065579370\n",
       "100000     671     6269     4.0  1065149201\n",
       "100001     671     6365     4.0  1070940363\n",
       "100002     671     6385     2.5  1070979663\n",
       "100003     671     6565     3.5  1074784724"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings_df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9120</th>\n",
       "      <td>162672</td>\n",
       "      <td>Mohenjo Daro (2016)</td>\n",
       "      <td>Adventure|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9121</th>\n",
       "      <td>163056</td>\n",
       "      <td>Shin Godzilla (2016)</td>\n",
       "      <td>Action|Adventure|Fantasy|Sci-Fi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9122</th>\n",
       "      <td>163949</td>\n",
       "      <td>The Beatles: Eight Days a Week - The Touring Y...</td>\n",
       "      <td>Documentary</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9123</th>\n",
       "      <td>164977</td>\n",
       "      <td>The Gay Desperado (1936)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9124</th>\n",
       "      <td>164979</td>\n",
       "      <td>Women of '69, Unboxed</td>\n",
       "      <td>Documentary</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieId                                              title  \\\n",
       "9120   162672                                Mohenjo Daro (2016)   \n",
       "9121   163056                               Shin Godzilla (2016)   \n",
       "9122   163949  The Beatles: Eight Days a Week - The Touring Y...   \n",
       "9123   164977                           The Gay Desperado (1936)   \n",
       "9124   164979                              Women of '69, Unboxed   \n",
       "\n",
       "                               genres  \n",
       "9120          Adventure|Drama|Romance  \n",
       "9121  Action|Adventure|Fantasy|Sci-Fi  \n",
       "9122                      Documentary  \n",
       "9123                           Comedy  \n",
       "9124                      Documentary  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies_df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies_df['movieRow'] = movies_df.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>movieRow</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9120</th>\n",
       "      <td>162672</td>\n",
       "      <td>Mohenjo Daro (2016)</td>\n",
       "      <td>Adventure|Drama|Romance</td>\n",
       "      <td>9120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9121</th>\n",
       "      <td>163056</td>\n",
       "      <td>Shin Godzilla (2016)</td>\n",
       "      <td>Action|Adventure|Fantasy|Sci-Fi</td>\n",
       "      <td>9121</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9122</th>\n",
       "      <td>163949</td>\n",
       "      <td>The Beatles: Eight Days a Week - The Touring Y...</td>\n",
       "      <td>Documentary</td>\n",
       "      <td>9122</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9123</th>\n",
       "      <td>164977</td>\n",
       "      <td>The Gay Desperado (1936)</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>9123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9124</th>\n",
       "      <td>164979</td>\n",
       "      <td>Women of '69, Unboxed</td>\n",
       "      <td>Documentary</td>\n",
       "      <td>9124</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieId                                              title  \\\n",
       "9120   162672                                Mohenjo Daro (2016)   \n",
       "9121   163056                               Shin Godzilla (2016)   \n",
       "9122   163949  The Beatles: Eight Days a Week - The Touring Y...   \n",
       "9123   164977                           The Gay Desperado (1936)   \n",
       "9124   164979                              Women of '69, Unboxed   \n",
       "\n",
       "                               genres  movieRow  \n",
       "9120          Adventure|Drama|Romance      9120  \n",
       "9121  Action|Adventure|Fantasy|Sci-Fi      9121  \n",
       "9122                      Documentary      9122  \n",
       "9123                           Comedy      9123  \n",
       "9124                      Documentary      9124  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies_df.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 筛选 movies_df 中的特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies_df = movies_df[['movieRow', 'movieId', 'title']]\n",
    "movies_df.to_csv('moviesProcessed.csv', index=False, header=True, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieRow</th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9120</th>\n",
       "      <td>9120</td>\n",
       "      <td>162672</td>\n",
       "      <td>Mohenjo Daro (2016)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9121</th>\n",
       "      <td>9121</td>\n",
       "      <td>163056</td>\n",
       "      <td>Shin Godzilla (2016)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9122</th>\n",
       "      <td>9122</td>\n",
       "      <td>163949</td>\n",
       "      <td>The Beatles: Eight Days a Week - The Touring Y...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9123</th>\n",
       "      <td>9123</td>\n",
       "      <td>164977</td>\n",
       "      <td>The Gay Desperado (1936)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9124</th>\n",
       "      <td>9124</td>\n",
       "      <td>164979</td>\n",
       "      <td>Women of '69, Unboxed</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      movieRow  movieId                                              title\n",
       "9120      9120   162672                                Mohenjo Daro (2016)\n",
       "9121      9121   163056                               Shin Godzilla (2016)\n",
       "9122      9122   163949  The Beatles: Eight Days a Week - The Touring Y...\n",
       "9123      9123   164977                           The Gay Desperado (1936)\n",
       "9124      9124   164979                              Women of '69, Unboxed"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies_df.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 将 ratings_df 中的 movieId 替换为行号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratings_df = pd.merge(ratings_df, movies_df, on='movieId')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>movieRow</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>31</td>\n",
       "      <td>2.5</td>\n",
       "      <td>1260759144</td>\n",
       "      <td>30</td>\n",
       "      <td>Dangerous Minds (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7</td>\n",
       "      <td>31</td>\n",
       "      <td>3.0</td>\n",
       "      <td>851868750</td>\n",
       "      <td>30</td>\n",
       "      <td>Dangerous Minds (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>31</td>\n",
       "      <td>31</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1273541953</td>\n",
       "      <td>30</td>\n",
       "      <td>Dangerous Minds (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>32</td>\n",
       "      <td>31</td>\n",
       "      <td>4.0</td>\n",
       "      <td>834828440</td>\n",
       "      <td>30</td>\n",
       "      <td>Dangerous Minds (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>36</td>\n",
       "      <td>31</td>\n",
       "      <td>3.0</td>\n",
       "      <td>847057202</td>\n",
       "      <td>30</td>\n",
       "      <td>Dangerous Minds (1995)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId  rating   timestamp  movieRow                   title\n",
       "0       1       31     2.5  1260759144        30  Dangerous Minds (1995)\n",
       "1       7       31     3.0   851868750        30  Dangerous Minds (1995)\n",
       "2      31       31     4.0  1273541953        30  Dangerous Minds (1995)\n",
       "3      32       31     4.0   834828440        30  Dangerous Minds (1995)\n",
       "4      36       31     3.0   847057202        30  Dangerous Minds (1995)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratings_df = ratings_df[['userId', 'movieRow', 'rating']]\n",
    "ratings_df.to_csv('ratingsProcessed.csv', index = False, header=True, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieRow</th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>30</td>\n",
       "      <td>2.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7</td>\n",
       "      <td>30</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>31</td>\n",
       "      <td>30</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>32</td>\n",
       "      <td>30</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>36</td>\n",
       "      <td>30</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieRow  rating\n",
       "0       1        30     2.5\n",
       "1       7        30     3.0\n",
       "2      31        30     4.0\n",
       "3      32        30     4.0\n",
       "4      36        30     3.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 创建电影评分矩阵 rating 和 评分记录矩阵 record"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "userNo = ratings_df['userId'].max()+1\n",
    "movieNo = ratings_df['movieRow'].max()+1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "672"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "userNo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9123"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movieNo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processed 5000, 95004 left\n",
      "processed 10000, 90004 left\n",
      "processed 15000, 85004 left\n",
      "processed 20000, 80004 left\n",
      "processed 25000, 75004 left\n",
      "processed 30000, 70004 left\n",
      "processed 35000, 65004 left\n",
      "processed 40000, 60004 left\n",
      "processed 45000, 55004 left\n",
      "processed 50000, 50004 left\n",
      "processed 55000, 45004 left\n",
      "processed 60000, 40004 left\n",
      "processed 65000, 35004 left\n",
      "processed 70000, 30004 left\n",
      "processed 75000, 25004 left\n",
      "processed 80000, 20004 left\n",
      "processed 85000, 15004 left\n",
      "processed 90000, 10004 left\n",
      "processed 95000, 5004 left\n",
      "processed 100000, 4 left\n"
     ]
    }
   ],
   "source": [
    "rating = np.zeros((movieNo, userNo))\n",
    "\n",
    "flag = 0\n",
    "ratings_df_length = np.shape(ratings_df)[0]\n",
    "\n",
    "for index, row in ratings_df.iterrows():\n",
    "    rating[int(row['movieRow']), int(row['userId'])] = row['rating']\n",
    "    flag += 1\n",
    "    if flag % 5000 == 0:\n",
    "        print('processed %d, %d left' % (flag, ratings_df_length-flag))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "record = rating>0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[False, False, False, ..., False,  True,  True],\n",
       "       [False, False, False, ..., False, False, False],\n",
       "       [False, False, False, ..., False, False, False],\n",
       "       ...,\n",
       "       [False, False, False, ..., False, False, False],\n",
       "       [False, False, False, ..., False, False, False],\n",
       "       [False, False, False, ..., False, False, False]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "record"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "record = np.array(record, dtype=int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 1, 1],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0]])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "record"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# 第三步：构建模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalizeRatings(rating, record):\n",
    "    m, n = rating.shape\n",
    "    rating_mean = np.zeros((m, 1))\n",
    "    rating_norm = np.zeros((m, n))\n",
    "    for i in range(m):\n",
    "        idx = record[i, :] !=0\n",
    "        rating_mean[i] = np.mean(rating[i, idx])\n",
    "        rating_norm[i, idx] -= rating_mean[i]\n",
    "    return rating_norm, rating_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "f:\\Anaconda3\\lib\\site-packages\\numpy\\core\\fromnumeric.py:2920: RuntimeWarning: Mean of empty slice.\n",
      "  out=out, **kwargs)\n",
      "f:\\Anaconda3\\lib\\site-packages\\numpy\\core\\_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  ret = ret.dtype.type(ret / rcount)\n"
     ]
    }
   ],
   "source": [
    "rating_norm, rating_mean = normalizeRatings(rating, record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "rating_norm = np.nan_to_num(rating_norm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,\n",
       "        -3.87246964, -3.87246964],\n",
       "       [ 0.        ,  0.        ,  0.        , ...,  0.        ,\n",
       "         0.        ,  0.        ],\n",
       "       [ 0.        ,  0.        ,  0.        , ...,  0.        ,\n",
       "         0.        ,  0.        ],\n",
       "       ...,\n",
       "       [ 0.        ,  0.        ,  0.        , ...,  0.        ,\n",
       "         0.        ,  0.        ],\n",
       "       [ 0.        ,  0.        ,  0.        , ...,  0.        ,\n",
       "         0.        ,  0.        ],\n",
       "       [ 0.        ,  0.        ,  0.        , ...,  0.        ,\n",
       "         0.        ,  0.        ]])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_norm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "rating_mean = np.nan_to_num(rating_mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[3.87246964],\n",
       "       [3.40186916],\n",
       "       [3.16101695],\n",
       "       ...,\n",
       "       [3.        ],\n",
       "       [0.        ],\n",
       "       [5.        ]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_features = 10\n",
    "X_parameters = tf.Variable(tf.random_normal([movieNo, num_features], stddev=0.35))\n",
    "Theta_paramters = tf.Variable(tf.random_normal([userNo, num_features], stddev=0.35))\n",
    "loss = 1/2 * tf.reduce_sum(((tf.matmul(X_parameters, Theta_paramters, transpose_b=True) - rating_norm)*record)**2) + \\\n",
    "    1/2 * (tf.reduce_sum(X_parameters**2) + tf.reduce_sum(Theta_paramters**2))\n",
    "optimizer = tf.train.AdamOptimizer()\n",
    "train = optimizer.minimize(loss)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第四步：训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.summary.scalar('loss', loss)\n",
    "summaryMerged = tf.summary.merge_all()\n",
    "filename = './movie_tensorboard'\n",
    "writer = tf.summary.FileWriter(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "sess = tf.Session()\n",
    "init = tf.global_variables_initializer()\n",
    "sess.run(init)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step: 0  train loss:0.10730  test loss:11.49665\n",
      "step: 100  train loss:0.10049  test loss:11.35875\n",
      "step: 200  train loss:0.07719  test loss:10.06480\n",
      "step: 300  train loss:0.04130  test loss:7.07488\n",
      "step: 400  train loss:0.01679  test loss:4.21624\n",
      "step: 500  train loss:0.00734  test loss:2.57023\n",
      "step: 600  train loss:0.00445  test loss:1.79445\n",
      "step: 700  train loss:0.00351  test loss:1.41947\n",
      "step: 800  train loss:0.00312  test loss:1.21572\n",
      "step: 900  train loss:0.00290  test loss:1.09030\n",
      "step: 1000  train loss:0.00275  test loss:1.00487\n",
      "step: 1100  train loss:0.00262  test loss:0.94221\n",
      "step: 1200  train loss:0.00252  test loss:0.89379\n",
      "step: 1300  train loss:0.00242  test loss:0.85495\n",
      "step: 1400  train loss:0.00234  test loss:0.82294\n",
      "step: 1500  train loss:0.00226  test loss:0.79602\n",
      "step: 1600  train loss:0.00219  test loss:0.77308\n",
      "step: 1700  train loss:0.00213  test loss:0.75333\n",
      "step: 1800  train loss:0.00207  test loss:0.73625\n",
      "step: 1900  train loss:0.00201  test loss:0.72144\n",
      "step: 2000  train loss:0.00196  test loss:0.70858\n",
      "step: 2100  train loss:0.00192  test loss:0.69745\n",
      "step: 2200  train loss:0.00187  test loss:0.68783\n",
      "step: 2300  train loss:0.00183  test loss:0.67957\n",
      "step: 2400  train loss:0.00179  test loss:0.67252\n",
      "step: 2500  train loss:0.00176  test loss:0.66657\n",
      "step: 2600  train loss:0.00172  test loss:0.66160\n",
      "step: 2700  train loss:0.00169  test loss:0.65754\n",
      "step: 2800  train loss:0.00166  test loss:0.65431\n",
      "step: 2900  train loss:0.00163  test loss:0.65184\n"
     ]
    }
   ],
   "source": [
    "penalty = movieNo*userNo\n",
    "\n",
    "for i in range(3000):\n",
    "    l, _, movie_summary = sess.run([loss, train, summaryMerged])\n",
    "    if i%100 == 0:\n",
    "        Current_X_parameters, Current_Theta_parameters = sess.run([X_parameters, Theta_paramters])\n",
    "        predicts = np.dot(Current_X_parameters,Current_Theta_parameters.T) + rating_mean\n",
    "        errors = np.mean((predicts - rating)**2)\n",
    "        print('step:', i, ' train loss:%.5f' % (l/penalty), ' test loss:%.5f' % errors)\n",
    "    writer.add_summary(movie_summary, i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "训练完成之后，打开 cmd，切换到 tensorboard 保存数据的目录下，运行 tensorboard --logdir=./，然后在浏览器中输入 127.0.0.1:6006 即可看到图形化的训练结果。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第五步：评估模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "Current_X_parameters, Current_Theta_parameters = sess.run([X_parameters, Theta_paramters])\n",
    "predicts = np.dot(Current_X_parameters,Current_Theta_parameters.T) + rating_mean\n",
    "errors = np.mean((predicts - rating)**2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6500955952713876"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "errors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第六步：构建完整的电影推荐系统"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "您要向哪位用户进行推荐？请输入用户编号：666\n",
      "==============================为该用户推荐的评分最高的20部电影是：===============================\n",
      "评分：2.89, 电影名：The Biggest Fan (2002)\n",
      "评分：2.88, 电影名：Dorian Blues (2004)\n",
      "评分：2.87, 电影名：Latter Days (2003)\n",
      "评分：2.87, 电影名：Long-Term Relationship (2006)\n",
      "评分：2.87, 电影名：Three of Hearts (1993)\n",
      "评分：2.87, 电影名：Back Soon (2007)\n",
      "评分：2.86, 电影名：Zerophilia (2005)\n",
      "评分：2.86, 电影名：Big Eden (2000)\n",
      "评分：2.86, 电影名：FAQs (2005)\n",
      "评分：2.86, 电影名：Curiosity of Chance, The (2006)\n",
      "评分：2.86, 电影名：Mambo Italiano (2003)\n",
      "评分：2.86, 电影名：On the Edge (2001)\n",
      "评分：2.86, 电影名：Touch of Pink (2004)\n",
      "评分：2.86, 电影名：Dancing in September (2000)\n",
      "评分：2.86, 电影名：Almost Normal (2005)\n",
      "评分：2.85, 电影名：I Think I Do (1997)\n",
      "评分：2.85, 电影名：Clockstoppers (2002)\n",
      "评分：2.85, 电影名：Trip, The (2002)\n",
      "评分：2.85, 电影名：Shining Through (1992)\n",
      "评分：2.85, 电影名：Shelter (2007)\n"
     ]
    }
   ],
   "source": [
    "user_id = input('您要向哪位用户进行推荐？请输入用户编号：')\n",
    "\n",
    "sortedResult = predicts[:, int(user_id)].argsort()[::-1]\n",
    "\n",
    "idx = 0\n",
    "print('为该用户推荐的评分最高的20部电影是：'.center(80, '='))\n",
    "for i in sortedResult:\n",
    "    print('评分：%.2f, 电影名：%s' % (predicts[i, int(user_id)], movies_df.iloc[i]['title']))\n",
    "    idx += 1\n",
    "    if idx == 20: break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
